In [6]:
import pandas as pd
import numpy as np
terror = pd.read_csv('file.csv', encoding='ISO-8859-1')
cleanedforuse = terror.filter(['imonth', 'iday', 'region', 'property', 'propextent',
                               'attacktype1', 'weaptype1', 'nperps', 'success',
                               'multiple', 'specificity'])
final = cleanedforuse.dropna()  # keep only rows with no missing values
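In [ ]:
# A quick look at what dropping incomplete rows costs, per column and overall
# (diagnostic sketch; not required for the pipeline):
print(cleanedforuse.isnull().sum())
print(len(cleanedforuse), '->', len(final), 'rows kept after dropping missing values')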
In [7]:
final.head()
Out[7]:
In [8]:
import sqlite3
conn = sqlite3.connect('Terrorisks.db')
In [9]:
final.to_sql('final', con=conn, if_exists='replace')  # the 'flavor' argument was removed from pandas; sqlite is the default
In [10]:
df = pd.read_sql_query('SELECT * FROM final', conn)
In [11]:
df.head(10)
Out[11]:
In [12]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score  # sklearn.cross_validation was renamed to model_selection
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
In [14]:
y, X = dmatrices('success ~ C(imonth) + C(iday) + region + C(property) + C(propextent) + C(attacktype1) + C(weaptype1)+ C(nperps) + specificity', df, return_type="dataframe")
print(X)
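In [ ]:
# patsy's C() marks a column as categorical and expands it into treatment-coded
# dummy columns, absorbing one level into the Intercept; a quick peek at the
# resulting design matrix (sketch):
print(X.shape)
print(list(X.columns[:10]))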
In [24]:
y = np.ravel(y)
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X, y)
# what percentage of attacks were successful? (the majority-class benchmark)
print("Benchmark:")
b = y.mean()
print(b)
# check the accuracy on the training set
a = model.score(X, y)
print("Score:")
print(a)
print(model.coef_)  # coefficients, in the same order as the design-matrix columns
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model2 = LogisticRegression()
model2.fit(X_train, y_train)
# predict class labels for the test set
predicted = model2.predict(X_test)
print (predicted)
# generate class probabilities
probs = model2.predict_proba(X_test)
print (probs)
# generate evaluation metrics
print (metrics.accuracy_score(y_test, predicted))
print (metrics.roc_auc_score(y_test, probs[:, 1]))
print (metrics.confusion_matrix(y_test, predicted))
print (metrics.classification_report(y_test, predicted))
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)
print (scores)
print (scores.mean())
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, probs[:, 1])  # probabilities, not hard labels, give a full curve
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC = %0.4f'% roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
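In [ ]:
# Logistic-regression coefficients are log-odds; exponentiating gives odds
# ratios, which are easier to read off (interpretation sketch):
odds_ratios = np.exp(model.coef_[0])
for name, ratio in zip(X.columns, odds_ratios):
    print("%s: %.3f" % (name, ratio))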
In [25]:
y, X = dmatrices('multiple ~ C(imonth) + C(iday) + region + C(property) + C(propextent) + C(attacktype1) + C(weaptype1)+ C(nperps) + specificity', df, return_type="dataframe")
In [26]:
y = np.ravel(y)
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X, y)
# what percentage had multiple?
print("Benchmark:")
b = y.mean()
print(b)
# check the accuracy on the training set
a = model.score(X, y)
print("Score:")
print(a)
print(model.coef_)  # coefficients, in the same order as the design-matrix columns
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model2 = LogisticRegression()
model2.fit(X_train, y_train)
# predict class labels for the test set
predicted = model2.predict(X_test)
print (predicted)
# generate class probabilities
probs = model2.predict_proba(X_test)
print (probs)
# generate evaluation metrics
print (metrics.accuracy_score(y_test, predicted))
print (metrics.roc_auc_score(y_test, probs[:, 1]))
print (metrics.confusion_matrix(y_test, predicted))
print (metrics.classification_report(y_test, predicted))
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)
print (scores)
print (scores.mean())
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, probs[:, 1])  # probabilities, not hard labels, give a full curve
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC = %0.4f'% roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
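In [ ]:
# 'multiple' is heavily imbalanced (mean around 0.13), so plain accuracy mostly
# tracks the majority class. One option worth comparing is class weighting;
# a sketch on the same train/test split:
model_bal = LogisticRegression(class_weight='balanced')
model_bal.fit(X_train, y_train)
print(metrics.roc_auc_score(y_test, model_bal.predict_proba(X_test)[:, 1]))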
In [31]:
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was renamed to model_selection
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
In [56]:
y = df['multiple']
In [57]:
X = df.filter(['imonth', 'iday', 'region','property',
'propextent','attacktype1','weaptype1','nperps','specificity'])
# one-hot encode the categorical inputs in a single pass
Xsix = pd.get_dummies(X,
                      columns=['imonth', 'iday', 'region', 'attacktype1', 'weaptype1', 'specificity'],
                      prefix={'imonth': 'month', 'iday': 'day', 'region': 'region',
                              'attacktype1': 'attacktype', 'weaptype1': 'weapontype',
                              'specificity': 'specificity'})
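In [ ]:
# One-hot encoding widens the frame from 9 raw columns to one column per
# category level; a quick shape check (sketch):
print(X.shape, '->', Xsix.shape)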
In [58]:
features_train, features_test, target_train, target_test = train_test_split(Xsix, y, test_size=0.2, random_state=0)
In [59]:
print("Benchmark: " )
print(1-(y.mean()))
In [60]:
# Random Forest
forest = RandomForestClassifier(n_estimators=10)
forest = forest.fit(features_train, target_train)
output = forest.predict(features_test).astype(int)
forest.score(features_train, target_train)  # training-set accuracy (optimistic; held-out score below)
Out[60]:
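In [ ]:
# The score above is on the training data, which flatters the forest; the
# held-out split gives a fairer read (sketch):
print(forest.score(features_test, target_test))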
In [49]:
probs = forest.predict_proba(features_test)[:, 1]  # probabilities, not hard labels, for the ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(target_test, probs)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC = %0.4f'% roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [50]:
scores = cross_val_score(forest, Xsix, y, scoring='accuracy', cv=10)  # score on the same dummified features the forest was trained on
print (scores)
print (scores.mean())
In [43]:
y = df['success']
X = df.filter(['imonth', 'iday', 'region','property',
'propextent','attacktype1','weaptype1','nperps','specificity'])
features_train, features_test, target_train, target_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Random Forest
forest = RandomForestClassifier(n_estimators=10)
forest = forest.fit(features_train, target_train)
output = forest.predict(features_test).astype(int)
score = forest.score(features_train, target_train)  # training-set accuracy
print("Benchmark: ")
print(y.mean())
print('Our Accuracy:')
print(score)
probs = forest.predict_proba(features_test)[:, 1]  # probabilities, not hard labels, for the ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(target_test, probs)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC = %0.4f'% roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
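In [ ]:
# Which inputs the forest actually leans on; feature_importances_ lines up
# with the training columns (diagnostic sketch):
for name, imp in sorted(zip(X.columns, forest.feature_importances_), key=lambda t: -t[1]):
    print("%s: %.3f" % (name, imp))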
In [53]:
from sklearn.tree import _tree
def leaf_depths(tree, node_id=0):
    '''
    tree.children_left and tree.children_right store ids
    of left and right children of a given node
    '''
    left_child = tree.children_left[node_id]
    right_child = tree.children_right[node_id]
    # If a given node is terminal, both left and right children
    # are set to _tree.TREE_LEAF
    if left_child == _tree.TREE_LEAF:
        # set depth of terminal nodes to 0
        depths = np.array([0])
    else:
        # get depths of left and right children and increment them by 1
        left_depths = leaf_depths(tree, left_child) + 1
        right_depths = leaf_depths(tree, right_child) + 1
        depths = np.append(left_depths, right_depths)
    return depths
def leaf_samples(tree, node_id=0):
    left_child = tree.children_left[node_id]
    right_child = tree.children_right[node_id]
    if left_child == _tree.TREE_LEAF:
        samples = np.array([tree.n_node_samples[node_id]])
    else:
        left_samples = leaf_samples(tree, left_child)
        right_samples = leaf_samples(tree, right_child)
        samples = np.append(left_samples, right_samples)
    return samples
def draw_tree(ensemble, tree_id=0):
    plt.figure(figsize=(8, 8))
    plt.subplot(211)
    tree = ensemble.estimators_[tree_id].tree_
    depths = leaf_depths(tree)
    plt.hist(depths, histtype='step', color='#9933ff',
             bins=range(min(depths), max(depths) + 1))
    plt.xlabel("Depth of leaf nodes (tree %s)" % tree_id)
    plt.subplot(212)
    samples = leaf_samples(tree)
    plt.hist(samples, histtype='step', color='#3399ff',
             bins=range(min(samples), max(samples) + 1))
    plt.xlabel("Number of samples in leaf nodes (tree %s)" % tree_id)
    plt.show()
def draw_ensemble(ensemble):
    plt.figure(figsize=(8, 8))
    plt.subplot(211)
    depths_all = np.array([], dtype=int)
    for x in ensemble.estimators_:
        tree = x.tree_
        depths = leaf_depths(tree)
        depths_all = np.append(depths_all, depths)
        plt.hist(depths, histtype='step', color='#ddaaff',
                 bins=range(min(depths), max(depths) + 1))
    plt.hist(depths_all, histtype='step', color='#9933ff',
             bins=range(min(depths_all), max(depths_all) + 1),
             weights=np.ones(len(depths_all)) / len(ensemble.estimators_),
             linewidth=2)
    plt.xlabel("Depth of leaf nodes")
    samples_all = np.array([], dtype=int)
    plt.subplot(212)
    for x in ensemble.estimators_:
        tree = x.tree_
        samples = leaf_samples(tree)
        samples_all = np.append(samples_all, samples)
        plt.hist(samples, histtype='step', color='#aaddff',
                 bins=range(min(samples), max(samples) + 1))
    plt.hist(samples_all, histtype='step', color='#3399ff',
             bins=range(min(samples_all), max(samples_all) + 1),
             weights=np.ones(len(samples_all)) / len(ensemble.estimators_),
             linewidth=2)
    plt.xlabel("Number of samples in leaf nodes")
    plt.show()
In [61]:
draw_tree(forest)
In [62]:
draw_ensemble(forest)
In [64]:
y = df['multiple']
X = df.filter(['imonth', 'iday', 'region','property',
'propextent','attacktype1','weaptype1','nperps','specificity'])
features_train, features_test, target_train, target_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Random Forest, with tree depth capped to curb overfitting
forest = RandomForestClassifier(n_estimators=10, max_depth=16)
forest = forest.fit(features_train, target_train)
output = forest.predict(features_test).astype(int)
score = forest.score(features_train, target_train)  # training-set accuracy
print("Benchmark: ")
print(1 - y.mean())
print('Our Accuracy:')
print(score)
probs = forest.predict_proba(features_test)[:, 1]  # probabilities, not hard labels, for the ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(target_test, probs)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC = %0.4f'% roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
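In [ ]:
# How the depth cap trades training fit against held-out accuracy; a rough
# sweep over a few candidate depths (sketch):
for d in (4, 8, 16, None):
    f = RandomForestClassifier(n_estimators=10, max_depth=d, random_state=0)
    f.fit(features_train, target_train)
    print(d, f.score(features_train, target_train), f.score(features_test, target_test))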
In [ ]:
import pandas as pd
In [ ]:
df = pd.read_csv('/Users/Laishumin/Datasets/globalterrorism.csv', encoding='ISO-8859-1', low_memory=False)
clean = df[['iyear', 'imonth', 'iday', 'region', 'specificity', 'vicinity',
            'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple', 'success',
            'suicide', 'attacktype1', 'ingroup', 'guncertain1', 'weaptype1']]
In [ ]:
df_dummies1= pd.get_dummies(clean, prefix='month', columns=['imonth'])
In [ ]:
df_dummies2= pd.get_dummies(df_dummies1, prefix='region', columns=['region'])
In [ ]:
df_dummies3= pd.get_dummies(df_dummies2, prefix='specificity', columns=['specificity'])
In [ ]:
df_dummies4= pd.get_dummies(df_dummies3, prefix='attack_type', columns=['attacktype1'])
In [ ]:
df_dummies5= pd.get_dummies(df_dummies4, prefix='main_weapon_type', columns=['weaptype1'])
In [ ]:
data = df_dummies5.copy()  # copy so the column deletions below do not alter df_dummies5
del data['iyear']
del data['iday']
del data['guncertain1']
del data['ingroup']
del data['doubtterr']
In [ ]:
names = list(data.columns.values)
names
In [ ]:
lift_multiple = []
for i in names:
    # count incidents where this (binary) feature equals 1
    num_Feature = int((data[i] == 1).sum())
    print("{0} from {1}".format(num_Feature, i))
    # of those, how many also have multiple == 1 (rule valid) vs not (invalid)
    rule_valid = int(((data[i] == 1) & (data['multiple'] == 1)).sum())
    rule_invalid = num_Feature - rule_valid
    print("{0} cases of the rule being valid were discovered".format(rule_valid))
    print("{0} cases of the rule being invalid were discovered".format(rule_invalid))
    # Now we have all the information needed to compute Support and Confidence
    support = rule_valid  # the Support is the number of times the rule is discovered
    if num_Feature == 0:
        lift_multiple.append(0)
    else:
        confidence = rule_valid / num_Feature
        lift = confidence / 0.13  # 0.13 = overall share of incidents with multiple == 1
        lift_multiple.append(lift)
        print(i + '-->Multiple')
        print("The support is {0}, the confidence is {1:.3f}, and the lift is {2:.3f}.".format(support, confidence, lift))
        print("As a percentage, the confidence is {0:.1f}%.".format(100 * confidence))
        print("-----------------------------------------------------------------")
In [ ]:
lift_multiple_pd = pd.DataFrame({'Lift': lift_multiple}, index=names)
lift_multiple_pd
In [ ]:
graph = lift_multiple_pd.sort_values('Lift', ascending=False)  # DataFrame.sort was removed from pandas
graph
In [ ]:
%matplotlib inline
graph.plot(kind='bar')
In [8]:
import numpy as np
import seaborn as sns
import pandas as pd
sns.violinplot(x="weaptype1", y="success", data=df, palette="Set3")
Out[8]:
In [9]:
sns.violinplot(x="propextent", y="multiple", data=df, palette="Set3")
Out[9]:
In [10]:
sns.violinplot(x="imonth", y="multiple", data=df, palette="Set3")
Out[10]:
In [11]:
sns.violinplot(x="property", y="multiple", data=df, palette="Set3")
Out[11]:
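In [ ]:
# With a binary outcome, a normalized cross-tab can read more directly than a
# violin plot; an alternative view of the same relationship (sketch):
print(pd.crosstab(df['property'], df['multiple'], normalize='index'))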